%load_ext autoreload
%autoreload 2
from Tools import *
C:\Users\mario\anaconda3\envs\tesi\lib\site-packages\pyproj\__init__.py:91: UserWarning: Valid PROJ data directory not found. Either set the path using the environmental variable PROJ_LIB or with `pyproj.datadir.set_data_dir`. warnings.warn(str(err))
stazioni = csv_to_gpd("data/step_1_data_output/stazioni_enriched", crs='3031')
clusters = csv_to_gpd("data/step_1_data_output/clusters", crs='3031')
staz_influence = np.load('data/step_3_data_output/staz_influence_score_corr.npy')
jans = [i for i in range(len(clusters)*2) if i % 2 == 0]
juls = [i for i in range(len(clusters)*2) if i % 2 == 1]
jans_staz_influence = staz_influence[jans, :]
juls_staz_influence = staz_influence[juls, :]
stazioni['jans_influences'] = np.sum(jans_staz_influence, axis=0)
stazioni['juls_influences'] = np.sum(juls_staz_influence, axis=0)
stazioni['influenced_by_clusters'] = (stazioni['jans_influences'] + stazioni['juls_influences']) / 2
stazioni
| ice_core | latitude | longitude | age | dust_conc_(ppb) | dust_conc_(ppb)_sd | acc_rate | dust_flux | altitude | geometry | distance_from_coastline(km) | nearest_cluster(num) | distance_to_nearest_cluster(km) | nearest_cropout(km) | cropout_area_within_500km | cropout_area_within_1000km | jans_influences | juls_influences | influenced_by_clusters | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Dome B | -77.0500 | 94.5500 | 2.0-11.7 kyr BP | 14.2 | 8 | 2.0 | 0.28 | 3650 | POINT (1408375.121 -112078.263) | 857.75 | 5 | 857.40 | 695.40 | 0.00 | 1216.26 | 0.0000 | 0.0000 | 0.00000 |
| 1 | VOSTOK-BH7 | -78.2800 | 106.4800 | 3.6-9.8 kyr BP | 17.8 | 8 | 2.0 | 0.36 | 3480 | POINT (1225197.054 -362454.750) | 1028.70 | 1 | 1035.97 | 918.02 | 0.00 | 698.30 | 0.0000 | 0.0000 | 0.00000 |
| 2 | EPICA-DC | -75.0600 | 123.2100 | 2.0-11.7 kyr BP | 14.3 | 8 | 2.7 | 0.39 | 3233 | POINT (1365573.936 -893946.981) | 864.30 | 1 | 960.60 | 886.72 | 0.00 | 4031.86 | 0.0000 | 4.1220 | 2.06100 |
| 3 | SOLARICE-DC | -75.0631 | 123.2480 | 3.3-4.5 kyr BP | 11.7 | 5 | 2.6 | 0.31 | 3233 | POINT (1364694.413 -894664.752) | 864.86 | 1 | 959.65 | 885.76 | 0.00 | 4226.02 | 0.0000 | 4.1360 | 2.06800 |
| 4 | DOME FUJI | -77.1901 | 39.4212 | 3.2-11.7 kyr BP | 20.0 | 11 | 2.7 | 0.54 | 3810 | POINT (887384.152 1079504.535) | 773.77 | 3 | 682.97 | 547.33 | 0.00 | 2621.77 | 0.0000 | 0.0000 | 0.00000 |
| 5 | EPICA-DML | -75.0000 | 0.0400 | 2.0-11.7 kyr BP | 25.0 | 14 | 6.0 | 1.50 | 2882 | POINT (1144.086 1638782.839) | 349.08 | 28 | 302.70 | 180.89 | 662.44 | 2540.52 | 9.8940 | 0.0000 | 4.94700 |
| 6 | TALDICE | -72.4900 | 159.1100 | 2.0-11.7 kyp BP | 25.1 | 10 | 7.0 | 1.76 | 2315 | POINT (683501.582 -1790851.524) | 174.98 | 14 | 193.16 | 33.56 | 3794.50 | 9906.88 | 110.2235 | 234.5135 | 172.36850 |
| 7 | TAYLOR DOME | -77.4747 | 158.4326 | 2.0-11.7 kyr BP | 21.0 | 14 | 6.0 | 1.18 | 2365 | POINT (502183.599 -1270482.492) | 103.02 | 24 | 132.10 | 37.22 | 8168.99 | 15376.44 | 79.5030 | 7.7885 | 43.64575 |
| 8 | DC ITASE | -75.0600 | 123.2100 | AD 1570-1800 | 8.0 | 4 | 2.5 | 0.20 | 3230 | POINT (1365573.936 -893946.981) | 864.30 | 1 | 960.60 | 886.72 | 0.00 | 4031.86 | 0.0000 | 4.1220 | 2.06100 |
| 9 | D4 ITASE | -75.3500 | 135.4900 | AD 1420-1700 | 9.0 | 4 | 2.0 | 0.19 | 2795 | POINT (1121749.967 -1141102.610) | 695.04 | 1 | 723.78 | 619.01 | 0.00 | 10203.59 | 0.0000 | 0.0000 | 0.00000 |
| 10 | MDP-A ITASE | -75.3200 | 145.5100 | AD 1620-1800 | 14.0 | 3 | 3.6 | 0.50 | 2455 | POINT (907972.472 -1321602.233) | 420.36 | 12 | 471.63 | 349.78 | 4140.11 | 11649.42 | 5.7275 | 9.9675 | 7.84750 |
stazioni['Flux [mg/m2/yr] (d<5 µm)'] = None
stazioni['Flux [mg/m2/yr] (5<=d<10 µm)'] = None
stazioni['Coarse/tot fraction [flux]'] = None
# TALDICE
stazioni.iloc[6, -3:] = [ 0.75, 0.35, 0.318]
#DC ITASE
stazioni.iloc[8, -3:] = [ 0.2, 0.03, 0.130]
#MDPNT
stazioni.iloc[10, -3:] = [ 0.5, 0.14, 0.219]
#D4 ITASE
stazioni.iloc[9, -3:] = [ 0.19, 0.05, 0.208]
#Vostok-bh7
stazioni.iloc[1, -3:] = [0.36, 0, 0]
#Epica-DML
stazioni.iloc[5, -3:] = [1.64, 0.17, 0.094]
stazioni_dust = stazioni[~stazioni['Coarse/tot fraction [flux]'].isna()]
stazioni_dust
| ice_core | latitude | longitude | age | dust_conc_(ppb) | dust_conc_(ppb)_sd | acc_rate | dust_flux | altitude | geometry | ... | distance_to_nearest_cluster(km) | nearest_cropout(km) | cropout_area_within_500km | cropout_area_within_1000km | jans_influences | juls_influences | influenced_by_clusters | Flux [mg/m2/yr] (d<5 µm) | Flux [mg/m2/yr] (5<=d<10 µm) | Coarse/tot fraction [flux] | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | VOSTOK-BH7 | -78.28 | 106.48 | 3.6-9.8 kyr BP | 17.8 | 8 | 2.0 | 0.36 | 3480 | POINT (1225197.054 -362454.750) | ... | 1035.97 | 918.02 | 0.00 | 698.30 | 0.0000 | 0.0000 | 0.0000 | 0.36 | 0 | 0 |
| 5 | EPICA-DML | -75.00 | 0.04 | 2.0-11.7 kyr BP | 25.0 | 14 | 6.0 | 1.50 | 2882 | POINT (1144.086 1638782.839) | ... | 302.70 | 180.89 | 662.44 | 2540.52 | 9.8940 | 0.0000 | 4.9470 | 1.64 | 0.17 | 0.094 |
| 6 | TALDICE | -72.49 | 159.11 | 2.0-11.7 kyp BP | 25.1 | 10 | 7.0 | 1.76 | 2315 | POINT (683501.582 -1790851.524) | ... | 193.16 | 33.56 | 3794.50 | 9906.88 | 110.2235 | 234.5135 | 172.3685 | 0.75 | 0.35 | 0.318 |
| 8 | DC ITASE | -75.06 | 123.21 | AD 1570-1800 | 8.0 | 4 | 2.5 | 0.20 | 3230 | POINT (1365573.936 -893946.981) | ... | 960.60 | 886.72 | 0.00 | 4031.86 | 0.0000 | 4.1220 | 2.0610 | 0.2 | 0.03 | 0.13 |
| 9 | D4 ITASE | -75.35 | 135.49 | AD 1420-1700 | 9.0 | 4 | 2.0 | 0.19 | 2795 | POINT (1121749.967 -1141102.610) | ... | 723.78 | 619.01 | 0.00 | 10203.59 | 0.0000 | 0.0000 | 0.0000 | 0.19 | 0.05 | 0.208 |
| 10 | MDP-A ITASE | -75.32 | 145.51 | AD 1620-1800 | 14.0 | 3 | 3.6 | 0.50 | 2455 | POINT (907972.472 -1321602.233) | ... | 471.63 | 349.78 | 4140.11 | 11649.42 | 5.7275 | 9.9675 | 7.8475 | 0.5 | 0.14 | 0.219 |
6 rows × 22 columns
from scipy.stats import pearsonr
print(
f"Pearson correlations with cluster_influence: [corr, p-value]\n\n\
Small (d<5 µm): \n\
Flux: {np.around(pearsonr(stazioni_dust.influenced_by_clusters, stazioni_dust['Flux [mg/m2/yr] (d<5 µm)']), 2)} \n\
\nBig (5<=d<10 µm):\n\
Flux: {np.around(pearsonr(stazioni_dust.influenced_by_clusters, stazioni_dust['Flux [mg/m2/yr] (5<=d<10 µm)']), 2)}\n\
\nRatio:\n\
{np.around(pearsonr(stazioni_dust.influenced_by_clusters, stazioni_dust['Coarse/tot fraction [flux]']), 2)}"
)
Pearson correlations with cluster_influence: [corr, p-value] Small (d<5 µm): Flux: [0.15 0.78] Big (5<=d<10 µm): Flux: [0.88 0.02] Ratio: [0.7 0.12]
from scipy.stats import shapiro, anderson, chisquare, kstest
from statsmodels.stats.diagnostic import lilliefors
import seaborn as sns
data = stazioni_dust['Flux [mg/m2/yr] (5<=d<10 µm)'].astype(float)
sns.boxplot(data)
C:\Users\mario\anaconda3\envs\tesi\lib\site-packages\seaborn\_decorators.py:36: FutureWarning: Pass the following variable as a keyword arg: x. From version 0.12, the only valid positional argument will be `data`, and passing other arguments without an explicit keyword will result in an error or misinterpretation. warnings.warn(
<AxesSubplot:xlabel='Flux [mg/m2/yr] (5<=d<10 µm)'>
sns.displot(data)
<seaborn.axisgrid.FacetGrid at 0x1ae68a89d80>
def test_norm(data):
##### Shapiro-Wilk Test
shapiro_p = round(shapiro(data).pvalue, 2)
##### Anderson-Darling Normality Test
##### Chisquare
_, chi_p = chisquare(data)
chi_p = round(chi_p, 2)
##### Kolmogorov-Smirnov
_, ks = kstest(data, 'norm')
ks = round(ks, 2)
##### Lilliefors
_, ll = lilliefors(data, 'norm')
ll = round(ll, 2)
print(f'{shapiro_p} -> Shapiro')
print(f'{chi_p} -> Chisquare')
print(f'{ks} -> Kolmogorov-Smirnov')
print(f'{ll} -> Lilliefors')
print('\nAnderson:')
def anderson_test(data):
res = anderson(data)
for i in range(len(res.critical_values)):
crit_val, sig_lev = res.critical_values[i], res.significance_level[i]
if res.statistic < crit_val:
print(f"probably normal {crit_val} critical value at level {sig_lev}")
else:
print(f"probably not normal {crit_val} critical value at level {sig_lev}")
anders = anderson_test(data)
fig = sns.distplot(data)
plt.show()
test_norm(data)
0.33 -> Shapiro 0.98 -> Chisquare 0.07 -> Kolmogorov-Smirnov 0.53 -> Lilliefors Anderson: probably normal 0.592 critical value at level 15.0 probably normal 0.675 critical value at level 10.0 probably normal 0.809 critical value at level 5.0 probably normal 0.944 critical value at level 2.5 probably normal 1.123 critical value at level 1.0
C:\Users\mario\anaconda3\envs\tesi\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
tutti i test confermano la normalità dei dati in input
# Poisson regression code
import statsmodels.api as sm
from sklearn.metrics import r2_score
stazioni_vars = stazioni_dust[['distance_from_coastline(km)', 'distance_to_nearest_cluster(km)',
'nearest_cropout(km)', 'cropout_area_within_500km',
'cropout_area_within_1000km', 'jans_influences',
'juls_influences', 'influenced_by_clusters']]
y = data.copy()
corr = stazioni_vars.corr()
sns.heatmap(corr)
plt.show()
le tre variabili 'distance_from_coastline(km)', 'distance_to_nearest_cluster(km)' e 'nearest_cropout(km)' sono collineari, ne scegliamo una, stessa cosa per l'area cropout presente entro i due raggi 500 e 1000 km e anche per le influenze
le tre variabili candidate per il modello finale sono le seguenti:
stazioni_vars = stazioni_vars[['distance_to_nearest_cluster(km)', 'cropout_area_within_1000km', 'influenced_by_clusters']]
corr = stazioni_vars.corr()
sns.heatmap(corr)
<AxesSubplot:>
non abbiamo nessuna collinearità, continuiamo l'analisi
stazioni_vars
| distance_to_nearest_cluster(km) | cropout_area_within_1000km | influenced_by_clusters | |
|---|---|---|---|
| 1 | 1035.97 | 698.30 | 0.0000 |
| 5 | 302.70 | 2540.52 | 4.9470 |
| 6 | 193.16 | 9906.88 | 172.3685 |
| 8 | 960.60 | 4031.86 | 2.0610 |
| 9 | 723.78 | 10203.59 | 0.0000 |
| 10 | 471.63 | 11649.42 | 7.8475 |
fig, axes = plt.subplots(figsize=(15,5), nrows=1, ncols=3)
axes[0].scatter(x=stazioni_vars['distance_to_nearest_cluster(km)'], y=y)
axes[1].scatter(x=stazioni_vars['cropout_area_within_1000km'], y=y)
axes[2].scatter(x=stazioni_vars['influenced_by_clusters'], y=y)
plt.show()
dall'analisi degli scatterplot delle variabili a disposizione contro la variabile target vediamo nei rispettivi plot: 1) distance_to_nearest_cluster(km) suggerisce una buona correlazione negativa, all'aumentare della distanza dai cluster il flux delle particelle sembra diminuire abbastanza linearmente 2) cropout_area_within_1000km è troppo sparsa, si vede una nuvola di punti, si decide di scartare la variabile 3) influenced_by_cluster ha una buona relazione come ci ha già detto il coefficiente di Pearson, ma la stazione TALDICE rappresenta un forte outlier ripsetto gli altri score bisognerà quindi vedere se trasformando i dati il punteggio del modello migliorerà
stazioni_vars = stazioni_vars[['distance_to_nearest_cluster(km)', 'influenced_by_clusters']].\
rename(columns={'distance_to_nearest_cluster(km)': 'cluster_distance',
'influenced_by_clusters': 'cluster_influence'})
def plot_regression(res, x, y):
fig, ax = plt.subplots(figsize=(8, 5))
if not hasattr(x, 'columns'):
x_pred = np.linspace(x.min(), x.max(), 2)
else:
x_pred = np.linspace(x.iloc[:,1].min(), x.iloc[:,1].max(), 2)
if not hasattr(x, 'columns'):
ax.scatter(x, y, alpha=1, color='tomato', s=75)
else:
ax.scatter(x.iloc[:,1], y, alpha=1, color='tomato', s=75)
fig.suptitle('Regression Model')
fig.tight_layout(pad=2)
ax.grid(True, alpha=0.5)
if not hasattr(x, 'columns'):
x_pred2 = x_pred
else:
x_pred2 = sm.add_constant(x_pred)
y_pred = res.predict(x_pred2)
ax.plot(x_pred, y_pred, color='brown', linewidth=2, alpha=0.5, ls='--')
y_hat = res.predict(x) # x is an array from line 12 above
y_err = y - y_hat
if not hasattr(x, 'columns'):
mean_x = x.T[1].mean()
else:
mean_x = x.iloc[:,1].T[1].mean()
n = len(x)
dof = n - res.df_model - 1
from scipy import stats
t = stats.t.ppf(1-0.025, df=dof)
s_err = np.sum(np.power(y_err, 2))
conf = t * np.sqrt((s_err/(n-2))*(1.0/n + (np.power((x_pred-mean_x),2) / ((np.sum(np.power(x_pred,2))) - n*(np.power(mean_x,2))))))
upper = y_pred + abs(conf)
lower = y_pred - abs(conf)
ax.fill_between(x_pred, lower, upper, color='#888888', alpha=0.3, label='confidence interval')
from statsmodels.sandbox.regression.predstd import wls_prediction_std
if hasattr(x, 'columns'):
sdev, lower, upper = wls_prediction_std(res, exog=x_pred2, alpha=0.05)
ax.fill_between(x_pred, lower, upper, color='#888888', alpha=0.1, label='prediction interval')
ax.legend(loc='upper left')
plt.show()
x = sm.add_constant(stazioni_vars['cluster_influence'])
y = data.values
mod = sm.OLS(
y,
x)
res = mod.fit()
print(res.summary())
plot_regression(res, x, y)
print(test_norm(res.resid))
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.773
Model: OLS Adj. R-squared: 0.716
Method: Least Squares F-statistic: 13.62
Date: Sun, 15 May 2022 Prob (F-statistic): 0.0210
Time: 22:32:25 Log-Likelihood: 8.7725
No. Observations: 6 AIC: -13.55
Df Residuals: 4 BIC: -13.96
Df Model: 1
Covariance Type: nonrobust
=====================================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------------
const 0.0722 0.031 2.310 0.082 -0.015 0.159
cluster_influence 0.0016 0.000 3.691 0.021 0.000 0.003
==============================================================================
Omnibus: nan Durbin-Watson: 2.293
Prob(Omnibus): nan Jarque-Bera (JB): 0.519
Skew: 0.382 Prob(JB): 0.772
Kurtosis: 1.779 Cond. No. 78.6
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
C:\Users\mario\anaconda3\envs\tesi\lib\site-packages\statsmodels\stats\stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 6 samples were given.
warn("omni_normtest is not valid with less than 8 observations; %i "
C:\Users\mario\anaconda3\envs\tesi\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
0.74 -> Shapiro 1.0 -> Chisquare 0.1 -> Kolmogorov-Smirnov 0.67 -> Lilliefors Anderson: probably normal 0.592 critical value at level 15.0 probably normal 0.675 critical value at level 10.0 probably normal 0.809 critical value at level 5.0 probably normal 0.944 critical value at level 2.5 probably normal 1.123 critical value at level 1.0
None
I residui sono distribuiti normalmente secondo tutti i test, abbiamo R2 intorno al 71,6%, sicuramente un buon inizio, ma dal grafico del modello di regressione
vediamo la maggior parte dei record schiacciati a sinistra, valutiamo quindi delle trasformazioni che rendano più lineari i dati
avendo molti valori compresi tra 0 e 1 e un solo outlier la trasformazione più consona è la radice, in quanto avvicinerà i valori tra loro
esploriamo le possibilità
fig, axes = plt.subplots(figsize=(15, 5), nrows=1, ncols=3)
axes[0].scatter(x=np.power(stazioni_vars['cluster_influence'], 1/2), y=y)
axes[1].scatter(x=np.power(stazioni_vars['cluster_influence'], 1/3), y=y)
axes[2].scatter(x=np.power(stazioni_vars['cluster_influence'], 1/4), y=y)
<matplotlib.collections.PathCollection at 0x1ae6b398a00>
la radice quadrata migliora la situazione ma non è ancora del tutto lineare, la radice cubica sembra la migliore, la base 4 tende ad aprire un po' troppo i valori
controlliamo i modelli per vedere se il fitting aumenta e se la variabile rimane significativa come ci aspettiamo
x = sm.add_constant(np.power(stazioni_vars['cluster_influence'], 1/2))
y = data.values
mod = sm.OLS(
y,
x)
res = mod.fit()
print(res.summary())
plot_regression(res, x, y)
print(test_norm(res.resid))
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.874
Model: OLS Adj. R-squared: 0.842
Method: Least Squares F-statistic: 27.70
Date: Sun, 15 May 2022 Prob (F-statistic): 0.00624
Time: 22:32:27 Log-Likelihood: 10.534
No. Observations: 6 AIC: -17.07
Df Residuals: 4 BIC: -17.48
Df Model: 1
Covariance Type: nonrobust
=====================================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------------
const 0.0441 0.026 1.711 0.162 -0.027 0.116
cluster_influence 0.0243 0.005 5.263 0.006 0.011 0.037
==============================================================================
Omnibus: nan Durbin-Watson: 2.425
Prob(Omnibus): nan Jarque-Bera (JB): 0.430
Skew: 0.432 Prob(JB): 0.807
Kurtosis: 2.013 Cond. No. 6.96
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
C:\Users\mario\anaconda3\envs\tesi\lib\site-packages\statsmodels\stats\stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 6 samples were given.
warn("omni_normtest is not valid with less than 8 observations; %i "
0.7 -> Shapiro 1.0 -> Chisquare 0.09 -> Kolmogorov-Smirnov 0.89 -> Lilliefors Anderson: probably normal 0.592 critical value at level 15.0 probably normal 0.675 critical value at level 10.0 probably normal 0.809 critical value at level 5.0 probably normal 0.944 critical value at level 2.5 probably normal 1.123 critical value at level 1.0
C:\Users\mario\anaconda3\envs\tesi\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
None
x = np.power(stazioni_vars['cluster_influence'], 1/2)
y = data.values
mod = sm.OLS(
y,
x)
res = mod.fit()
print(res.summary())
plot_regression(res, x, y)
print(test_norm(res.resid))
OLS Regression Results
=======================================================================================
Dep. Variable: y R-squared (uncentered): 0.896
Model: OLS Adj. R-squared (uncentered): 0.875
Method: Least Squares F-statistic: 43.02
Date: Sun, 15 May 2022 Prob (F-statistic): 0.00124
Time: 22:32:27 Log-Likelihood: 8.8871
No. Observations: 6 AIC: -15.77
Df Residuals: 5 BIC: -15.98
Df Model: 1
Covariance Type: nonrobust
=====================================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------------
cluster_influence 0.0289 0.004 6.559 0.001 0.018 0.040
==============================================================================
Omnibus: nan Durbin-Watson: 1.850
Prob(Omnibus): nan Jarque-Bera (JB): 0.486
Skew: 0.326 Prob(JB): 0.784
Kurtosis: 1.767 Cond. No. 1.00
==============================================================================
Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.
C:\Users\mario\anaconda3\envs\tesi\lib\site-packages\statsmodels\stats\stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 6 samples were given.
warn("omni_normtest is not valid with less than 8 observations; %i "
C:\Users\mario\anaconda3\envs\tesi\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
0.66 -> Shapiro 0.99 -> Chisquare 0.08 -> Kolmogorov-Smirnov 0.53 -> Lilliefors Anderson: probably normal 0.592 critical value at level 15.0 probably normal 0.675 critical value at level 10.0 probably normal 0.809 critical value at level 5.0 probably normal 0.944 critical value at level 2.5 probably normal 1.123 critical value at level 1.0
None
x = np.power(stazioni_vars['cluster_influence'], 1/3)
y = data.values
mod = sm.OLS(
y,
x)
res = mod.fit()
print(res.summary())
plot_regression(res, x, y)
print(test_norm(res.resid))
OLS Regression Results
=======================================================================================
Dep. Variable: y R-squared (uncentered): 0.948
Model: OLS Adj. R-squared (uncentered): 0.938
Method: Least Squares F-statistic: 91.44
Date: Sun, 15 May 2022 Prob (F-statistic): 0.000212
Time: 22:32:28 Log-Likelihood: 10.979
No. Observations: 6 AIC: -19.96
Df Residuals: 5 BIC: -20.17
Df Model: 1
Covariance Type: nonrobust
=====================================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------------
cluster_influence 0.0647 0.007 9.562 0.000 0.047 0.082
==============================================================================
Omnibus: nan Durbin-Watson: 2.457
Prob(Omnibus): nan Jarque-Bera (JB): 0.285
Skew: -0.183 Prob(JB): 0.867
Kurtosis: 1.998 Cond. No. 1.00
==============================================================================
Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.
C:\Users\mario\anaconda3\envs\tesi\lib\site-packages\statsmodels\stats\stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 6 samples were given.
warn("omni_normtest is not valid with less than 8 observations; %i "
C:\Users\mario\anaconda3\envs\tesi\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
0.79 -> Shapiro 0.97 -> Chisquare 0.09 -> Kolmogorov-Smirnov 0.86 -> Lilliefors Anderson: probably normal 0.592 critical value at level 15.0 probably normal 0.675 critical value at level 10.0 probably normal 0.809 critical value at level 5.0 probably normal 0.944 critical value at level 2.5 probably normal 1.123 critical value at level 1.0
None
x = np.power(stazioni_vars['cluster_influence'], 1/4)
y = data.values
mod = sm.OLS(
y,
x)
res = mod.fit()
print(res.summary())
plot_regression(res, x, y)
print(test_norm(res.resid))
OLS Regression Results
=======================================================================================
Dep. Variable: y R-squared (uncentered): 0.940
Model: OLS Adj. R-squared (uncentered): 0.928
Method: Least Squares F-statistic: 78.21
Date: Sun, 15 May 2022 Prob (F-statistic): 0.000307
Time: 22:32:28 Log-Likelihood: 10.537
No. Observations: 6 AIC: -19.07
Df Residuals: 5 BIC: -19.28
Df Model: 1
Covariance Type: nonrobust
=====================================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------------
cluster_influence 0.0915 0.010 8.843 0.000 0.065 0.118
==============================================================================
Omnibus: nan Durbin-Watson: 3.031
Prob(Omnibus): nan Jarque-Bera (JB): 0.799
Skew: -0.885 Prob(JB): 0.671
Kurtosis: 2.745 Cond. No. 1.00
==============================================================================
Notes:
[1] R² is computed without centering (uncentered) since the model does not contain a constant.
[2] Standard Errors assume that the covariance matrix of the errors is correctly specified.
C:\Users\mario\anaconda3\envs\tesi\lib\site-packages\statsmodels\stats\stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 6 samples were given.
warn("omni_normtest is not valid with less than 8 observations; %i "
C:\Users\mario\anaconda3\envs\tesi\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms). warnings.warn(msg, FutureWarning)
0.5 -> Shapiro 0.25 -> Chisquare 0.09 -> Kolmogorov-Smirnov 0.59 -> Lilliefors Anderson: probably normal 0.592 critical value at level 15.0 probably normal 0.675 critical value at level 10.0 probably normal 0.809 critical value at level 5.0 probably normal 0.944 critical value at level 2.5 probably normal 1.123 critical value at level 1.0
None
fig, axes = plt.subplots(figsize=(15, 5), nrows=1, ncols=3)
axes[0].scatter(x=np.power(stazioni_vars['cluster_distance'], 1/2), y=y)
axes[1].scatter(x=np.power(stazioni_vars['cluster_distance'], 1/3), y=y)
axes[2].scatter(x=np.power(stazioni_vars['cluster_distance'], 1/4), y=y)
<matplotlib.collections.PathCollection at 0x1ae6bfa5930>
x = sm.add_constant(np.power(stazioni_vars, 1/3))
y = data.values
mod = sm.OLS(
y,
x)
res = mod.fit()
print(res.summary())
print(test_norm(res.resid))
OLS Regression Results
==============================================================================
Dep. Variable: y R-squared: 0.992
Model: OLS Adj. R-squared: 0.987
Method: Least Squares F-statistic: 192.1
Date: Sun, 15 May 2022 Prob (F-statistic): 0.000682
Time: 22:32:29 Log-Likelihood: 18.905
No. Observations: 6 AIC: -31.81
Df Residuals: 3 BIC: -32.43
Df Model: 2
Covariance Type: nonrobust
=====================================================================================
coef std err t P>|t| [0.025 0.975]
-------------------------------------------------------------------------------------
const 0.3842 0.063 6.084 0.009 0.183 0.585
cluster_distance -0.0387 0.007 -5.844 0.010 -0.060 -0.018
cluster_influence 0.0326 0.006 5.751 0.010 0.015 0.051
==============================================================================
Omnibus: nan Durbin-Watson: 3.538
Prob(Omnibus): nan Jarque-Bera (JB): 0.759
Skew: 0.011 Prob(JB): 0.684
Kurtosis: 1.257 Cond. No. 91.0
==============================================================================
Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
0.26 -> Shapiro
1.0 -> Chisquare
0.07 -> Kolmogorov-Smirnov
0.27 -> Lilliefors
Anderson:
probably normal 0.592 critical value at level 15.0
probably normal 0.675 critical value at level 10.0
probably normal 0.809 critical value at level 5.0
probably normal 0.944 critical value at level 2.5
probably normal 1.123 critical value at level 1.0
C:\Users\mario\anaconda3\envs\tesi\lib\site-packages\statsmodels\stats\stattools.py:74: ValueWarning: omni_normtest is not valid with less than 8 observations; 6 samples were given.
warn("omni_normtest is not valid with less than 8 observations; %i "
C:\Users\mario\anaconda3\envs\tesi\lib\site-packages\seaborn\distributions.py:2619: FutureWarning: `distplot` is a deprecated function and will be removed in a future version. Please adapt your code to use either `displot` (a figure-level function with similar flexibility) or `histplot` (an axes-level function for histograms).
warnings.warn(msg, FutureWarning)
None
import plotly.express as px
import plotly.graph_objects as go
mesh_size = .1
x_min, x_max = x.cluster_distance.min(), x.cluster_distance.max()
y_min, y_max = x.cluster_influence.min(), x.cluster_influence.max()
xrange = np.linspace(x_min, x_max, 10)
yrange = np.linspace(y_min, y_max, 10)
xx, yy = np.meshgrid(xrange, yrange)
#Predict for all points on the mesh
exog = pd.core.frame.DataFrame({'Distance': xx.ravel(), 'Influence': yy.ravel()})
exog = sm.add_constant(exog)
pred = res.predict(exog = exog)
pred = pred.values.reshape(xx.shape)
#Plot the original point, then x1,x2 grid plane pushed up by z
#A surface is drawn using a surface that connects all the points.
fig = px.scatter_3d(x=x.cluster_distance, y=x.cluster_influence, z=y, width=900, height=800)
# fig.update_traces(marker=dict(size=5)
fig.update_layout(scene = dict(
xaxis = dict(
title='Distance from Cluster',
gridcolor="white",
showbackground=True,
zerolinecolor="white"),
yaxis = dict(
title='Influence from Clusters',
gridcolor="white",
showbackground=True,
zerolinecolor="white"),
zaxis = dict(
title = 'Flux',
gridcolor="white",
showbackground=True,
zerolinecolor="white"))
)
fig.add_traces(go.Surface(x=xrange, y=yrange, z=pred, name='pred_surface', opacity=0.5))
plt.tight_layout()
fig.show()
<Figure size 432x288 with 0 Axes>